{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import joblib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(33465, 6705)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "input_dir = '../../../preprocess_data_new/'\n",
    "data_date = joblib.load(input_dir + 'train_ax_date.lz4')[:33465]\n",
    "data_nodup =joblib.load(input_dir + 'train_ax_nodup.lz4').drop(columns=['id','loan_dt'])[:33465]\n",
    "data_null = joblib.load(input_dir + 'train_ax_null.lz4')[:33465]\n",
    "# 为了迁就valid中tag特征append到最后的格式\n",
    "data_tag = data_nodup[['tag']] \n",
    "data_nodup = data_nodup.drop(columns=['tag'])\n",
    "data = pd.concat([data_date,data_nodup,data_null,data_tag],axis=1)\n",
    "\n",
    "x = data.fillna(-1).values\n",
    "data_label = joblib.load(input_dir + 'train_y_33465.lz4')\n",
    "y = data_label['label'].values\n",
    "feature_names = list(data.columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### fscore_33465"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
       "       colsample_bytree=1, early_stopping_rounds=100, eval_metric='auc',\n",
       "       gamma=1, learning_rate=0.05, max_delta_step=0, max_depth=3,\n",
       "       min_child_weight=2, missing=None, n_estimators=220, n_jobs=24,\n",
       "       nthread=None, objective='binary:logistic', random_state=0,\n",
       "       reg_alpha=0, reg_lambda=1, scale_pos_weight=14.22520473157416,\n",
       "       seed=2018, silent=False, subsample=0.9)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from xgboost import XGBClassifier\n",
    "xgb = XGBClassifier(n_estimators = 220,\n",
    "                      max_depth = 3,\n",
    "                      min_child_weight = 2,\n",
    "                      learning_rate =0.05, \n",
    "                      subsample = 0.9,\n",
    "                      booster='gbtree',\n",
    "                      objective='binary:logistic',\n",
    "                      early_stopping_rounds=100,\n",
    "                      scale_pos_weight=float(len(y)-np.sum(y))/float(np.sum(y)),\n",
    "                      eval_metric='auc',\n",
    "                      gamma=1,\n",
    "                      reg_lambda=1,\n",
    "                      seed=2018,\n",
    "                      silent=False,\n",
    "                      n_jobs=24\n",
    "                             )\n",
    "xgb.fit(x,y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['feature_names']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "joblib.dump(xgb,'xgb_model')\n",
    "joblib.dump(data.columns,'feature_names')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
